package com.bugslog.api.manager.impl;

import com.bugslog.api.manager.SpiderManager;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Service;

import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

@Service
public class SpiderManagerImpl implements SpiderManager{

    @Override
    public void analysis() {

    }

    public static void main(String[] args) {
        try {
            String url = "https://www.alexa.com/topsites/category/Regional/Asia/India/News_and_Media";
            Document doc = Jsoup.connect(url)
                    .userAgent("Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; MALC)")
                    .timeout(20000)
                    .get();
            StringBuffer sb = new StringBuffer();
            Element body = doc.body();
//            Element data = body.getElementById("fsb");
            Elements tbody = doc.getElementsByClass("listings");
            Elements trows = tbody.get(0).getElementsByClass("site-listing");

            Iterator<Element> it = trows.iterator();
            while (it.hasNext()) {
                Element trow = it.next();
                Elements troww = trow.getElementsByClass("trow-w");
                Elements tmpData = troww.get(0).getElementsByClass("data");
                Elements link = tmpData.get(0).getElementsByTag("a");
                sb.append(link.get(0).attr("href") + ",");
            }
//            List<Element> lists = bgbu.subList(2,bgbu.size()-1);

/*            for(Element a: lists) {
                Elements as = a.getElementsByTag("a");
                Iterator<Element> it = as.iterator();
                while(it.hasNext()) {
                    Element tmp = it.next();
                    sb.append(tmp.attr("href") + ",");
                }
            }*/
            System.out.print(sb.toString());
//            String data = doc.toString().toLowerCase();
        }catch(Exception e) {

        }
    }

    public static void main3(String[] args) {
        String newUrlEn = "http://us.cnn.com/,https://www.nytimes.com/,http://www.huffingtonpost.com/,http://www.foxnews.com/,https://www.usatoday.com/,http://www.reuters.com/,http://www.politico.com/,https://www.yahoo.com/news/,http://www.npr.org/,http://www.latimes.com/,http://www.nbcnews.com/,http://www.cbsnews.com/,http://nypost.com/,http://abcnews.go.com/,http://www.nydailynews.com/,http://www.breitbart.com/,http://www.denverpost.com/,http://newyork.cbslocal.com/,http://losangeles.cbslocal.com/,http://chicago.cbslocal.com/,http://dfw.cbslocal.com/,http://washington.cbslocal.com/,http://boston.cbslocal.com/,http://philadelphia.cbslocal.com/,http://minnesota.cbslocal.com/,http://sanfrancisco.cbslocal.com/,http://detroit.cbslocal.com/,http://tampa.cbslocal.com/,http://www.chicagotribune.com/,http://www.theonion.com/,http://www.newsmax.com/,http://www.washingtontimes.com/,https://www.boston.com/,http://www.newsweek.com/,http://www.mercurynews.com/,http://www.philly.com/,http://www.seattletimes.com/,http://www.miamiherald.com/news/,http://observer.com/,http://www.stltoday.com/,http://gothamist.com/,http://ktla.com/,http://www.seattlepi.com/,http://www.newsday.com/,http://chicago.suntimes.com/,http://www.laweekly.com/,http://abc13.com/,http://wtop.com/,http://www.bostonherald.com/,http://www.nbcnewyork.com/,http://wgntv.com/,http://abc7news.com/,http://www.autonews.com/,http://kdvr.com/,http://www.miaminewtimes.com/,http://www.twincities.com/,http://kxan.com/,http://www.nbcchicago.com/,http://www.nbcwashington.com/,http://fox2now.com/,http://brooklyn.news12.com/,http://www.nbclosangeles.com/,http://pix11.com/,http://www.phillyvoice.com/,http://www.villagevoice.com/,http://www.westword.com/,http://www.houstonpress.com/,http://kron4.com/,http://www.nbcphiladelphia.com/,http://www.dailyherald.com/,http://www.nbcsandiego.com/,http://www.nbcdfw.com/news/,http://www.phoenixnewtimes.com/,http://arlington.wickedlocal.com/,http://www.amny.com/,http://www.chicagobusiness.com/,
http://www.pe.com/,http://www.wxyz.com/,http://whdh.com/,http://wfla.com/,http://fox5sandiego.com/,http://www.nbcmiami.com/,http://wsvn.com/news/,http://www.riverfronttimes.com/,http://www.abcactionnews.com/,http://wivb.com/,http://www.chicagoreader.com/,https://www.minnpost.com/,http://news10.com/,http://www.metrotimes.com/,https://www.texasobserver.org/,https://billypenn.com/,http://timesofsandiego.com/,http://www.nysun.com/,http://www.laobserved.com/,http://citylimits.org/,http://www.miamitodaynews.com/,http://kplr11.com/,http://atlantaintownpaper.com/,http://laindependent.com/";
        String url = "timesofindia.indiatimes.com/,ndtv.com/,indiatoday.intoday.in,indianexpress.com," +
                "thehindu.com,news18.com,firstpost.com,business-standard.com,dnaindia.com,deccanchronicle.com," +
                "oneindia.com,financialexpress.com,scroll.in,thehindubusinessline.com,thequint.com,outlookindia.com," +
                "freepressjournal.in,teluguglobal.in,newsx.com,asianage.com,dailyexcelsior.com,telanganatoday.news," +
                "chandigarhmetro.com,navhindtimes.in,risingkashmir.com,thesangaiexpress.com,nagpurtoday.in,arunachaltimes.in," +
                "kashmirreader.com,newstodaynet.com,news.statetimes.in,orissapost.com,mydigitalfc.com,starofmysore.com," +
                "emitpost.com,darjeelingtimes.com,thetimesofbengal.com,thenorthlines.com,thenewshimachal.com,goacom.com," +
                "bilkulonline.com,himtimes.com,asbanews.com,hindustantimes.com,indianexpress.com/,dailypioneer.com,deccanherald.com," +
                "telegraphindia.com,dnaindia.com,deccanchronicle.com,economictimes.indiatimes.com,business-standard.com," +
                "financialexpress.com,newindianexpress.com,livemint.com,tribuneindia.com,in.news.yahoo.com,zeenews.india.com," +
                "w3newspapers.com,centralchronicle.com,thehitavada.com,greaterkashmir.com,jehlumpost.com,dailypost.in," +
                "kashmirobserver.net,kashmirtimes.com,news.statetimes.in,impressivetimes.com,mailtoday.in,milligazette.com," +
                "organiser.org,peoplesdemocracy.in,sundayguardianlive.com,indianhorizon.org,epaper.navjammu.com,thehawk.in," +
                "assamtribune.com,poknapham.in,ifp.co.in,theshillongtimes.com,morungexpress.com,nagalandpost.com,nagalandpage.com," +
                "sentinelassam.com,sikkimexpress.com,dailydesherkatha.net,syandanpatrika.com,nongsain.com,net.glpublications.in," +
                "timesofassam.com,eclecticnortheast.in,bangaloremirror.indiatimes.com,thehansindia.com,ahmedabadmirror.indiatimes.com," +
                "afternoondc.in,afternoonvoice.com,epaperlokmat.in/lokmattimes/,mid-day.com,mumbaimirror.indiatimes.com,oheraldo.in," +
                "punemirror.indiatimes.com,accommodationtimes.com";
        String url111 = "http://timesofindia.indiatimes.com/,http://timesofindia.indiatimes.com/,http://www.thehindu.com/,http://www.thehindu.com/,http://www.hindustantimes.com/,http://www.hindustantimes.com/,http://indianexpress.com/,http://indianexpress.com/,http://www.dailypioneer.com/,http://www.dailypioneer.com/,http://www.deccanherald.com/,http://www.deccanherald.com/,https://www.telegraphindia.com/,https://www.telegraphindia.com/,http://www.dnaindia.com/,http://www.dnaindia.com/,http://www.deccanchronicle.com/,http://www.deccanchronicle.com/,http://www.asianage.com/,http://www.asianage.com/,/uk/,http://economictimes.indiatimes.com/,http://economictimes.indiatimes.com/,http://www.business-standard.com/,http://www.business-standard.com/,http://www.financialexpress.com/,http://www.financialexpress.com/,http://www.newindianexpress.com/,http://www.newindianexpress.com/,http://www.livemint.com/,http://www.livemint.com/,http://www.tribuneindia.com/,http://www.tribuneindia.com/,https://www.w3newspapers.com/pakistan/,http://www.andamanchronicle.net/,http://www.andamanchronicle.net/,http://echoofindia.com/,http://echoofindia.com/,http://www.andamansheekha.com/,http://www.orissapost.com/,http://www.orissapost.com/,http://www.thestatesman.com/,http://www.thestatesman.com/,http://epaper.himalayadarpan.com/,http://epaper.himalayadarpan.com/,/nepal/,http://www.centralchronicle.com/,http://www.centralchronicle.com/,http://www.dailyexcelsior.com/,http://www.dailyexcelsior.com/,http://www.greaterkashmir.com/,http://www.greaterkashmir.com/,http://www.thehitavada.com/,http://www.thehitavada.com/,http://www.jehlumpost.com/,http://www.jehlumpost.com/,https://dailypost.in/,https://kashmirobserver.net/,http://www.kashmirtimes.com/,http://news.statetimes.in/,http://www.impressivetimes.com/,http://www.mailtoday.in/,http://www.milligazette.com/,http://www.organiser.org/,http://peoplesdemocracy.in/,http://www.sundayguardianlive.com/,http://indianhorizon.org/,http://epaper.navjammu.com/,
http://www.thehawk.in/,http://www.assamtribune.com/,http://www.assamtribune.com/,http://www.poknapham.in/,http://www.poknapham.in/,http://www.thesangaiexpress.com/,http://www.thesangaiexpress.com/,http://www.ifp.co.in/,http://www.ifp.co.in/,http://www.theshillongtimes.com/,http://www.theshillongtimes.com/,http://morungexpress.com/,http://morungexpress.com/,http://www.nagalandpost.com/,http://www.nagalandpage.com/,http://www.sentinelassam.com/,http://sikkimexpress.com/,http://www.dailydesherkatha.net/,http://www.syandanpatrika.com/,/bangladesh/,http://www.nongsain.com/,http://net.glpublications.in/,https://www.timesofassam.com/,https://eclecticnortheast.in/,http://bangaloremirror.indiatimes.com/,http://bangaloremirror.indiatimes.com/,http://www.thehindubusinessline.com/,http://www.thehindubusinessline.com/,https://www.newstodaynet.com/,https://www.newstodaynet.com/,https://starofmysore.com/,http://www.mangaloretoday.com,http://www.thehansindia.com/,http://www.deccanage.com/";
//        String url = "news18.com";
        String hindiUrl = "http://aajtak.intoday.in/,https://www.bhaskar.com/,https://www.patrika.com/,http://www.khaskhabar.com/," +
                "http://www.samachar.com/,https://navbharattimes.indiatimes.com/,http://hindi.webdunia.com/,https://hindi.news18.com/," +
                "https://hindi.oneindia.com/,http://www.dainiknavajyoti.com/hindi/,http://www.jansatta.com/,http://naidunia.jagran.com/," +
                "http://navbharattimes.indiatimes.com/,http://www.amarujala.com/,http://www.punjabkesari.in/,http://rajasthanpatrika.patrika.com/," +
                "http://www.bhaskar.com/,http://www.livehindustan.com/,http://www.jagran.com/,http://www.sumanasa.com/hindinews/,http://naiduniaepaper.jagran.com/," +
                "http://www.prabhatkhabar.com/,http://www.patrika.com/,http://www.webmilap.com/,http://www.deshbandhu.co.in/,https://www.patrika.com/rajasthan-news/," +
                "http://www.rashtriyasahara.com/,http://www.haribhoomi.com/,http://ranchiexpress.com/,http://epaperlokmat.in/lokmatsamachar/," +
                "http://hindi.economictimes.indiatimes.com/,http://loktej.com/,http://pratahkal.com/,http://www.prabhasakshi.com/,http://www.khaskhabar.com/," +
                "http://tehelkahindi.com/,http://www.virarjun.com/,http://www.uttamhindu.com/,http://www.deshdoot.com/,http://hindi.business-standard.com/," +
                "http://dainiksandhyaprakash.com/,http://sandhyapravakta.com/,http://dainiktribuneonline.com/,http://www.divyahimachal.com/,http://www.dailynews360.com/," +
                "http://www.sanjeevnitoday.com/,http://www.navabharat.com/,http://www.samacharjagat.com/,http://www.dainiksaveratimes.com/,http://www.panchjanya.com/," +
                "http://www.chauthiduniya.com/,http://www.swatantrabharat.com/,http://dainikpurvoday.com/,https://bharatkhabar.com/,http://www.dandakaranyasamachar.com/," +
                "http://www.epaper.sachkahoon.com/,http://www.dailynewsactivist.com/,http://pp.glpublications.in/,http://aapkikhabar.com/,http://rajexpress.co/," +
                "http://sachkaujala.com/,http://www.krishakjagat.org/,http://dabangdunia.co/,http://sandhyajyotidarpan.com/,http://www.agniban.com/," +
                "http://inextlive.jagran.com/,http://www.dailychhattisgarh.com/,http://sanmarg.in/,http://www.pradeshtoday.com/,http://www.pudhari.news/," +
                "http://jagruktimes.co.in/,http://www.hamaramahanagar.in/,http://epaper.swadesh.in/,http://www.aajkaanandpapers.com/,https://amarbharti.com/," +
                "http://epaper.viraatvaibhav.com/,http://deshonnati.digitaledition.in/,http://www.sarohabulletin.com/,http://highwaychannel.in/," +
                "http://www.tarunmitra.in/,http://www.jansandeshtimes.net/,http://www.awantika.com/,http://www.royalbulletin.com/,http://shahtimesnews.com/epaper/," +
                "http://epaper.thehawk.in/,http://parichaytimes.in/,http://naiduniaonline.com/,http://raagdesh.com/,http://vyaparkesari.com/,http://punjabkesari.com/";
        String hindiUrl2 = "https://hindi.news18.com/,https://hindi.news18.com/,http://hindi.webdunia.com/,http://hindi.webdunia.com/,/magazines/cricket/,https://hindi.oneindia.com/,https://hindi.oneindia.com/,http://www.dailyhindinews.com/,http://www.dailyhindinews.com/,http://zeenews.india.com/hindi/,http://zeenews.india.com/hindi/,http://abpnews.abplive.in/,http://abpnews.abplive.in/,http://www.bbc.com/hindi,http://www.bbc.com/hindi,https://news.google.co.in/?edchanged=1&ned=hi_in&authuser=0,https://news.google.co.in/?edchanged=1&ned=hi_in&authuser=0,http://www.inkhabar.com/,http://aajtak.intoday.in/,http://www.univarta.com/,http://www.swatantraawaz.com/,http://raviwar.com/,http://ianshindi.com/,http://www.pressnote.in/,http://emsindia.com/,http://hn.newsbharati.com/,http://newswing.com/,http://navsancharsamachar.com/,http://www.prativad.com/,http://www.samaylive.com/,http://www.peoplessamachar.co.in/,http://emalwa.com/,http://legendnews.in/,http://www.raftaar.in/,http://www.palpalindia.com/,http://www.bhopalsamachar.com,http://www.nationalduniya.in/,http://www.januday.com/,http://mediadarbar.com/,https://www.bhadas4media.com/,http://mediakhabar.com/,http://newswing.com/,http://www.thehinditimes.com/,http://www.dw.com/hi/%E0%A4%96%E0%A4%AC%E0%A4%B0%E0%A5%87%E0%A4%82/s-11931,http://www.dastaknews.com/,http://www.hastakshep.com/,http://www.janadesh.in/,http://www.garjachhattisgarhnews.com/,http://www.emstv.in/,http://www.yashbharat.com/,http://www.dudhwalive.com/,https://www.pravakta.com/,http://www.agrasamachar.com/,https://www.newstracklive.com/,http://poorvanchalmedia.com/,http://teznews.com/,http://www.samacharplus.com/,http://www.janjagranmediamanch.com/,http://hindi.thatscricket.com/,http://uppatrika.com/,https://www.puridunia.com/,http://tahlkanews.com/,http://hindi.irib.ir/,https://www.sabguru.com/,http://www.newsstate.com/,http://epatrakar.com/,https://hindi.sportskeeda.com/,http://www.24hindinews.com/,http://www.wahcricket.com/,https://tosnews.com/,http:/
/ghamasan.com/,http://todaynews18.com/,http://www.alivenews.co.in/,http://hindi.apnlive.com/,https://www.journalistcafe.com/,http://jaianndata.com/";
        String[] urls = newUrlEn.split(",");
//        String[] urls2 = hindiUrl2.split(",");
        Set<String> urlSet = new HashSet<>();
        int index = 1;
        for(String uuu: urls) {
            urlSet.add(index + "`" + uuu);
            index++;
        }
/*        for(String uuu: urls2) {
            urlSet.add(uuu);
        }*/
        ExecutorService executorService = Executors.newFixedThreadPool(5);
        for(String u: urlSet) {
            /*if (!a(u, "http://")) {
                if (!a(u, "https://")) {
                    if (!a(u, "http://www.")) {
                        if (!a(u, "https://www.")) {
                            System.out.println(u + "`" + "error");
                        }
                    }
                }
            }

        }*/
            executorService.execute(new Runnable() {
                @Override
                public void run() {
                    if(!a(u,"")){
                        System.out.println(u + "`" + "error");
                    }
                    /*if(!a(u,"http://")) {
                        if(!a(u,"https://")){
                            if(!a(u,"http://www.")) {
                                if(!a(u,"https://www.")){
                                    System.out.println(u + "`" + "error");
                                }
                            }
                        }
                    }*/
                }
            });
        }
        executorService.shutdown();
    }

    public static Boolean a(String u, String prefix) {
        try{
            String[] urlSplits = u.split("`");
            String url = prefix + urlSplits[1];
            Document doc = Jsoup.connect(url)
                    .userAgent("Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; MALC)")
                    .timeout(20000)
                    .get();
//            Element head = doc.head();
            String data = doc.toString().toLowerCase();
//            StringBuffer platform = new StringBuffer();
            if(data.indexOf("taboola") > -1 && data.indexOf("outbrain") > -1) {
//                platform.append("");
                System.out.println(u + "`" + "taboola,outbrain");
                return true;
            }else if(data.indexOf("taboola") > -1){
                System.out.println(u + "`" + "taboola");
//                platform.append("taboola,");
                return true;
            }else if(data.indexOf("outbrain") > -1){
                System.out.println(u + "`" + "outbrain");
//                platform.append("outbrain,");
                return true;
            }else if(data.indexOf("colombia") > -1){
//                platform.append("colombia,");
                System.out.println(u + "`" + "colombia");
                return true;
            }else if(data.indexOf("revcontent") > -1){
//                platform.append("revcontent,");

                System.out.println(u + "`" + "revcontent");
                return true;
            }else if(data.indexOf("mgid") > -1){
//                platform.append("mgid,");
                System.out.println(u + "`" + "mgid");
                return true;
            }else{
                Elements links = doc.body().getElementsByTag("a");
                Iterator<Element> link = links.iterator();
                int index = 0;
                Boolean get = false;
                while (link.hasNext()) {
                    if(index > 60)
                        break;
                    String linkUrl = link.next().attr("href");
                    if(linkUrl.indexOf(u) < 0)
                        continue;
                    index++;
                    if(index < 30)
                        continue;
                    Document tmp = Jsoup.connect(linkUrl)
                            .userAgent("Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; MALC)")
                            .timeout(10000)
                            .get();
                    String dataTmp = tmp.toString().toLowerCase();
                    if(dataTmp.indexOf("taboola") > -1 && dataTmp.indexOf("outbrain") > -1) {
                        System.out.println(u + "`" + "taboola,outbrain");
                        get = true;
                        break;
                    }else if(dataTmp.indexOf("taboola") > -1){
                        System.out.println(u + "`" + "taboola");
                        get = true;
                        break;
                    }else if(dataTmp.indexOf("outbrain") > -1){
                        System.out.println(u + "`" + "outbrain");
                        get = true;
                        break;
                    }else if(data.indexOf("colombia") > -1){
//                platform.append("colombia,");
                        System.out.println(u + "`" + "colombia");
                        get = true;
                        break;
                    }else if(data.indexOf("revcontent") > -1){
//                platform.append("revcontent,");
                        System.out.println(u + "`" + "revcontent");
                        get = true;
                        break;
                    }else if(data.indexOf("mgid") > -1){
//                platform.append("mgid,");
                        System.out.println(u + "`" + "mgid");
                        get = true;
                        break;
                    }
                }
                if(!get) {
                    System.out.println(u + "`" + " ");
                }
                return true;
            }
        }catch (Exception e) {
            return false;
        }
    }
}
